GO-PCA demo: DMAP

Author: Florian Wagner
Email: florian.wagner@duke.edu

In [1]:
# print version numbers of `gopca` and `genometools` packages used to run this notebook
import gopca
import genometools

print('gopca version:', gopca.__version__)
print('genometools version:', genometools.__version__)


[2016-12-16 13:53:35] INFO: Starting new HTTPS connection (1): api.plot.ly
gopca version: 0.2.0
genometools version: 0.2.1

Configuration


In [2]:
import os

download_dir = 'download'  # downloaded files will be stored here

#gene2accession_url = 'https://www.dropbox.com/s/bemduo72bu9fe6f/gene2accession_2016-01-18_human.tsv.gz?dl=1'
#gene2accession_file = os.path.join(download_dir, 'gene2accession_2016-01-18_human.tsv.gz')

#gene_annotation_url = 'ftp://ftp.ensembl.org/pub/release-83/gtf/homo_sapiens/Homo_sapiens.GRCh38.83.gtf.gz'
#gene_annotation_file = os.path.join(download_dir, 'Homo_sapiens.GRCh38.83.gtf.gz')

gene_ontology_url = 'http://viewvc.geneontology.org/viewvc/GO-SVN/ontology-releases/2016-01-18/go-basic.obo'
gene_ontology_file = os.path.join(download_dir, 'go-basic_2016-01-18.obo')

#go_annotation_url = 'ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/old/HUMAN/gene_association.goa_human.153.gz'
#go_annotation_file = os.path.join(download_dir, 'gene_association.goa_human.153.gz')

go_gene_sets_url = 'https://www.dropbox.com/s/s9osj0lfnoonjtt/GO_gene_sets_human_ensembl83_goa153_ontology2016-01-18.tsv?dl=1'
go_gene_sets_file = os.path.join(download_dir, 'GO_gene_sets_human.tsv')

gene_expression_url = 'https://www.dropbox.com/s/obn0imd623yul7i/dmap_expression_mapped.tsv?dl=1'
gene_expression_file = os.path.join(download_dir, 'dmap_expression.tsv')

Setup


In [3]:
# make sure the notebook uses the entire width of the screen
from IPython.core.display import HTML, display
display(HTML("""
    <style>
        .container { width:95% !important; }
    </style>"""))

# set up plotting with plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
init_notebook_mode()  # embed plotly graphs in the notebook

import genometools
from genometools import misc
import gopca

logger = misc.get_logger()


Download files


In [4]:
import os
from genometools import misc

# create the download directory if necessary
misc.make_sure_dir_exists(download_dir)

# download gene2accession file
#if not os.path.isfile(gene2accession_file):
#    misc.http_download(gene2accession_url, gene2accession_file)

# download Ensembl gene annotations
#if not os.path.isfile(gene_annotation_file):
#    misc.ftp_download(gene_annotation_url, gene_annotation_file)

# download Gene Ontology
if not os.path.isfile(gene_ontology_file):
    misc.http_download(gene_ontology_url, gene_ontology_file)

# download GO annotations
#if not os.path.isfile(go_annotation_file):
#    misc.ftp_download(go_annotation_url, go_annotation_file)

# download GO-derived gene sets
if not os.path.isfile(go_gene_sets_file):
    misc.http_download(go_gene_sets_url, go_gene_sets_file)
    
# download gene expression data
if not os.path.isfile(gene_expression_file):
    misc.http_download(gene_expression_url, gene_expression_file)


[2016-12-16 13:53:36] INFO: Starting new HTTP connection (1): viewvc.geneontology.org
[2016-12-16 13:53:40] INFO: Downloaded file "download/go-basic_2016-01-18.obo".
[2016-12-16 13:53:40] INFO: Starting new HTTPS connection (1): www.dropbox.com
[2016-12-16 13:53:41] INFO: Starting new HTTPS connection (1): dl.dropboxusercontent.com
[2016-12-16 13:53:47] INFO: Downloaded file "download/GO_gene_sets_human.tsv".
[2016-12-16 13:53:47] INFO: Starting new HTTPS connection (1): www.dropbox.com
[2016-12-16 13:53:47] INFO: Starting new HTTPS connection (1): dl.dropboxusercontent.com
[2016-12-16 13:53:49] INFO: Downloaded file "download/dmap_expression.tsv".

Prepare input data


In [5]:
from genometools.basic import GeneSetCollection
from genometools.ontology import GeneOntology
from genometools.expression import ExpMatrix

# read the expression data
matrix = ExpMatrix.read_tsv(gene_expression_file)

# read gene sets
go_gene_sets = GeneSetCollection.read_tsv(go_gene_sets_file)

# read Gene Ontology
gene_ontology = GeneOntology.read_obo(gene_ontology_file)


[2016-12-16 13:53:52] INFO: Parsed 44180 GO term definitions.
[2016-12-16 13:53:52] INFO: Adding child and part relationships...
[2016-12-16 13:53:53] INFO: Flattening ancestors...
[2016-12-16 13:54:05] INFO: Flattening descendants...

Run GO-PCA


In [6]:
from gopca import GOPCA, GOPCAParams

params = GOPCAParams()  # contains GO-PCA default parameter values

analysis = GOPCA.simple_setup(matrix, params, go_gene_sets, gene_ontology)

run = analysis.run()


[2016-12-16 13:54:17] INFO: Timestamp: 2016-12-16 18:54:17.343777
[2016-12-16 13:54:17] INFO: Size of expression matrix: p=8547 genes x n=211 samples.
[2016-12-16 13:54:17] INFO: Expression matrix hash: 35ffb6e179534a2d43b7a340d8ea1d81
[2016-12-16 13:54:17] INFO: Configuration #1 hash: bf167e628450920c3771bb3663f83d74
[2016-12-16 13:54:17] INFO: Estimating the number of principal components (seed = 0)...
/home/fw36/.local/opt/miniconda2/envs/py3notebook/lib/python3.5/site-packages/sklearn/utils/deprecation.py:52: DeprecationWarning:

Class RandomizedPCA is deprecated; RandomizedPCA was deprecated in 0.18 and will be removed in 0.20. Use PCA(svd_solver='randomized') instead. The new implementation DOES NOT store whiten ``components_``. Apply transform to get them.

[2016-12-16 13:54:26] INFO: The estimated number of non-trivial PCs is 15.
[2016-12-16 13:54:26] INFO: Performing PCA...
[2016-12-16 13:54:26] INFO: Fraction of total variance explained by the first 15 PCs: 80.5%
[2016-12-16 13:54:26] INFO: Generating GO-PCA signatures for configuration 1...
[2016-12-16 13:54:42] INFO: 
[2016-12-16 13:54:42] INFO: ======================================================================
[2016-12-16 13:54:42] INFO: GO-PCA for configuration #1 generated 48 signatures.
[2016-12-16 13:54:42] INFO: ----------------------------------------------------------------------
[2016-12-16 13:54:42] INFO: CC: secretory granule lumen (GO:0034774) [1:13/38, 19@965, p=2.7e-08, e=5.4]
[2016-12-16 13:54:42] INFO: BP: defense response to fungus (GO:0050832) [1:5/8, 6@413, p=9.3e-07, e=15.8]
[2016-12-16 13:54:42] INFO: BP: bicarbonate transport (GO:0015701) [1:5/13, 5@58, p=8.5e-08, e=56.7]
[2016-12-16 13:54:42] INFO: BP: G1/S transition of mitotic cell cycle (GO:0000082) [2:39/123, 39@1064, p=5.5e-08, e=2.7]
[2016-12-16 13:54:42] INFO: BP: hydrogen peroxide catabolic process (GO:0042744) [2:7/9, 7@332, p=2.0e-08, e=31.0]
[2016-12-16 13:54:42] INFO: BP: pos. regulation of T cell activation (GO:0050870) [-2:26/101, 38@840, p=4.9e-13, e=4.5]
[2016-12-16 13:54:42] INFO: BP: type I interferon signal. pathway (GO:0060337) [-2:22/52, 25@1064, p=2.6e-09, e=6.8]
[2016-12-16 13:54:42] INFO: BP: cellular defense response (GO:0006968) [-2:10/41, 26@1059, p=2.4e-13, e=11.8]
[2016-12-16 13:54:42] INFO: BP: natural killer cell activation (GO:0030101) [-2:5/12, 8@535, p=4.3e-07, e=14.5]
[2016-12-16 13:54:42] INFO: CC: T cell receptor complex (GO:0042101) [-2:9/14, 10@604, p=1.4e-08, e=38.2]
[2016-12-16 13:54:42] INFO: CC: cullin-RING ubiquitin ligase complex (GO:0031461) [3:19/64, 21@807, p=9.1e-07, e=4.0]
[2016-12-16 13:54:42] INFO: BP: regulation of leukocyte prolif. (GO:0070663) [-3:16/74, 21@339, p=4.5e-12, e=8.3]
[2016-12-16 13:54:42] INFO: CC: MHC class II protein complex (GO:0042613) [-3:9/11, 9@254, p=5.2e-12, e=113.7]
[2016-12-16 13:54:42] INFO: BP: leukocyte migration (GO:0050900) [4:27/143, 42@1001, p=2.8e-08, e=2.5]
[2016-12-16 13:54:42] INFO: CC: membrane raft (GO:0045121) [4:20/106, 30@662, p=1.2e-09, e=3.7]
[2016-12-16 13:54:42] INFO: BP: toll-like receptor signal. pathway (GO:0002224) [4:22/103, 33@575, p=1.0e-13, e=4.9]
[2016-12-16 13:54:42] INFO: BP: MyD88-dependent toll-like receptor signal. pathway (GO:0002755) [4:19/71, 24@575, p=1.1e-10, e=5.0]
[2016-12-16 13:54:42] INFO: BP: IFN-gamma-mediated signal. pathway (GO:0060333) [4:14/65, 19@386, p=2.7e-10, e=6.9]
[2016-12-16 13:54:42] INFO: BP: humoral immune response (GO:0006959) [4:16/55, 25@1012, p=3.7e-09, e=7.5]
[2016-12-16 13:54:42] INFO: BP: cell redox homeostasis (GO:0045454) [4:14/32, 17@880, p=1.5e-08, e=10.0]
[2016-12-16 13:54:42] INFO: BP: detection of external biotic stimulus (GO:0098581) [4:7/11, 8@359, p=7.4e-09, e=33.5]
[2016-12-16 13:54:42] INFO: MF: MHC class II receptor activity (GO:0032395) [4:6/7, 6@201, p=3.2e-09, e=36.4]
[2016-12-16 13:54:42] INFO: BP: DNA strand elongation involved in DNA replication (GO:0006271) [-4:17/30, 18@1050, p=6.1e-09, e=5.9]
[2016-12-16 13:54:42] INFO: BP: B cell prolif. (GO:0042100) [5:5/10, 5@138, p=9.2e-07, e=31.0]
[2016-12-16 13:54:42] INFO: BP: leukocyte aggregation (GO:0070486) [-5:16/78, 22@310, p=2.5e-13, e=7.9]
[2016-12-16 13:54:42] INFO: BP: platelet degranulation (GO:0002576) [6:19/66, 29@936, p=7.0e-11, e=5.6]
[2016-12-16 13:54:42] INFO: BP: regulation of B cell receptor signal. pathway (GO:0050855) [6:5/7, 6@426, p=2.8e-07, e=23.4]
[2016-12-16 13:54:42] INFO: BP: cytokine production (GO:0001816) [-6:5/46, 13@418, p=9.6e-07, e=5.8]
[2016-12-16 13:54:42] INFO: BP: regulation of wound healing (GO:0061041) [7:9/50, 23@1043, p=2.2e-08, e=5.6]
[2016-12-16 13:54:42] INFO: CC: platelet alpha granule (GO:0031091) [7:17/47, 18@217, p=3.5e-16, e=16.5]
[2016-12-16 13:54:42] INFO: CC: platelet alpha granule membrane (GO:0031092) [7:7/8, 7@368, p=7.6e-09, e=40.5]
[2016-12-16 13:54:42] INFO: BP: tRNA processing (GO:0008033) [8:25/84, 29@1029, p=2.8e-07, e=2.9]
[2016-12-16 13:54:42] INFO: BP: nucleoside phosphate biosynthetic process (GO:1901293) [8:21/76, 27@1065, p=8.1e-07, e=3.3]
[2016-12-16 13:54:42] INFO: BP: regulation of mitotic nuclear division (GO:0007088) [8:17/84, 22@419, p=3.9e-10, e=5.3]
[2016-12-16 13:54:42] INFO: BP: centromere complex assembly (GO:0034508) [8:12/24, 16@1010, p=3.4e-09, e=11.1]
[2016-12-16 13:54:42] INFO: CC: U1 snRNP (GO:0005685) [8:8/10, 9@616, p=2.6e-09, e=12.5]
[2016-12-16 13:54:42] INFO: BP: mitotic sister chromatid segregation (GO:0000070) [8:11/46, 12@174, p=6.7e-10, e=12.8]
[2016-12-16 13:54:42] INFO: CC: condensed chromosome kinetochore (GO:0000777) [8:8/17, 9@87, p=1.8e-13, e=71.8]
[2016-12-16 13:54:42] INFO: BP: neg. regulation of MAPK cascade (GO:0043409) [-8:9/74, 22@721, p=4.3e-07, e=3.5]
[2016-12-16 13:54:42] INFO: BP: defense response to bacterium (GO:0042742) [11:15/59, 17@301, p=8.7e-11, e=8.4]
[2016-12-16 13:54:42] INFO: CC: specific granule (GO:0042581) [11:5/14, 5@85, p=8.4e-07, e=35.9]
[2016-12-16 13:54:42] INFO: BP: cotranslational protein targeting to membrane (GO:0006613) [-11:32/99, 35@1065, p=1.7e-08, e=2.9]
[2016-12-16 13:54:42] INFO: BP: regulation of membrane protein ectodomain prote... (GO:0051043) [12:5/14, 7@299, p=8.6e-07, e=19.3]
[2016-12-16 13:54:42] INFO: BP: platelet aggregation (GO:0070527) [12:9/23, 10@258, p=3.8e-09, e=20.0]
[2016-12-16 13:54:42] INFO: MF: heparin binding (GO:0008201) [-14:5/28, 8@189, p=6.9e-07, e=12.9]
[2016-12-16 13:54:42] INFO: CC: stress fiber (GO:0001725) [15:6/25, 10@347, p=1.4e-07, e=14.0]
[2016-12-16 13:54:42] INFO: BP: inflammatory response (GO:0006954) [-15:21/107, 32@609, p=8.8e-12, e=4.2]
[2016-12-16 13:54:42] INFO: BP: neg. regulation of viral genome replication (GO:0045071) [-15:5/35, 12@461, p=8.6e-07, e=7.4]
[2016-12-16 13:54:42] INFO: ======================================================================
[2016-12-16 13:54:42] INFO: 
[2016-12-16 13:54:42] INFO: This GO-PCA run took 25.58 s.

Plot GO-PCA results


In [7]:
# generate the signature matrix figure
fig = run.sig_matrix.get_figure(width=1200, height=800)
iplot(fig)



In [8]:
# look up the signature in the GO-PCA result
sig = run.sig_matrix.get_signature('DNA strand')  # note that we don't have to specify the full signature name

# generate a heatmap
fig = sig.get_figure(run.sig_matrix)  # we're providing the signature matrix in order to show the samples in the same ordering

iplot(fig)


[2016-12-16 13:54:43] INFO: Ordering samples to match order in signature matrix.

Copyright (c) 2016 Florian Wagner.

This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.